Data loading and structure

What the data look like

# Preview the first four rows of the ride data.
head(bike, n = 4L)
##            ride_id rideable_type          started_at            ended_at
## 1 726C3A99FFCAE10C  classic_bike 2022-08-18 18:08:30 2022-08-18 19:00:37
## 2 F1AC3AED5E7498FB  classic_bike 2022-08-11 18:28:21 2022-08-11 18:44:35
## 3 9C93876268A75FD7  classic_bike 2022-08-28 19:40:43 2022-08-28 20:52:50
## 4 45AFFC2B7A7BD7C9  classic_bike 2022-08-15 20:21:00 2022-08-15 20:44:37
##                 start_station_name start_station_id
## 1 Grandview Library at Oakland Ave               79
## 2          Jaeger St & Whittier St               59
## 3           High St & Crestview Rd               88
## 4           High St & Crestview Rd               88
##                   end_station_name end_station_id start_lat start_lng  end_lat
## 1 Grandview Library at Oakland Ave             79  39.98193 -83.04898 39.98193
## 2          Jaeger St & Whittier St             59  39.94460 -82.98950 39.94460
## 3           High St & Crestview Rd             88  40.02252 -83.01364 40.02252
## 4           High St & Crestview Rd             88  40.02252 -83.01364 40.02252
##     end_lng member_casual
## 1 -83.04898        member
## 2 -82.98950        member
## 3 -83.01364        member
## 4 -83.01364        casual

Data preparation

Dealing with NA values

# Total number of missing cells across the whole data frame.
print(sprintf("%d number of NA in the data", sum(is.na(bike))))
## [1] "2131 number of NA in the data"
# Missing values per column. vapply() guarantees one integer per column
# (sapply() can silently change its return type), and sum(is.na(.)) counts
# the NAs directly instead of going through which() and length().
vapply(bike, function(column) sum(is.na(column)), integer(1))
##            ride_id      rideable_type         started_at           ended_at 
##                  0                  0                  0                  0 
## start_station_name   start_station_id   end_station_name     end_station_id 
##                  0                870                  0               1231 
##          start_lat          start_lng            end_lat            end_lng 
##                  0                  0                 15                 15 
##      member_casual 
##                  0

Most of the NA values are in the columns start_station_id and end_station_id. However, since the station name is present, I don’t think we should delete the whole row. For further investigation, I subset the NA values to look for any pattern.

#subset the NAs in end_station_id (comparing with `== NA` never matches; use is.na())
#navalues <- bike[is.na(bike$end_station_id), ]
#head(navalues)

We can see repeated values of end_station_name and start_station_name, so let's check whether these station names are also present in the main data (bike).

# Check whether the stations with a missing id also appear in the main data
#station_name <- bike$start_station_name %in% navalues$start_station_name
# Rows that end at "Scioto Audubon Center", keeping only the end-station
# name and id. which() drops NA comparisons, as subset() does.
station1 <- bike[
  which(bike$end_station_name == "Scioto Audubon Center"),
  c("end_station_name", "end_station_id"),
  drop = FALSE
]

#same_name<-sum(station_name[station_name == TRUE])

I created a Boolean vector to check whether the station names in the subset are present in the main dataset.

Change columns type

# Store the two categorical columns (rideable_type, member_casual) as factors.
# NOTE: rideable_type ends up with three levels; the rare one needs a look.
bike[c("rideable_type", "member_casual")] <-
  lapply(bike[c("rideable_type", "member_casual")], factor)

# Split each timestamp into a clock-time string (HH:MM:SS) and a Date.
bike$start_time <- format(as.POSIXct(bike[["started_at"]]), "%H:%M:%S")
bike$end_time <- format(as.POSIXct(bike[["ended_at"]]), "%H:%M:%S")
bike$start_date <- as.Date(bike[["started_at"]])
bike$end_date <- as.Date(bike[["ended_at"]])

# Inspect the structure again to confirm the new column types.
str(bike)
## 'data.frame':    7416 obs. of  17 variables:
##  $ ride_id           : chr  "726C3A99FFCAE10C" "F1AC3AED5E7498FB" "9C93876268A75FD7" "45AFFC2B7A7BD7C9" ...
##  $ rideable_type     : Factor w/ 3 levels "classic_bike",..: 1 1 1 1 3 3 1 1 1 1 ...
##  $ started_at        : chr  "2022-08-18 18:08:30" "2022-08-11 18:28:21" "2022-08-28 19:40:43" "2022-08-15 20:21:00" ...
##  $ ended_at          : chr  "2022-08-18 19:00:37" "2022-08-11 18:44:35" "2022-08-28 20:52:50" "2022-08-15 20:44:37" ...
##  $ start_station_name: chr  "Grandview Library at Oakland Ave" "Jaeger St & Whittier St" "High St & Crestview Rd" "High St & Crestview Rd" ...
##  $ start_station_id  : num  79 59 88 88 88 88 55 55 88 54 ...
##  $ end_station_name  : chr  "Grandview Library at Oakland Ave" "Jaeger St & Whittier St" "High St & Crestview Rd" "High St & Crestview Rd" ...
##  $ end_station_id    : num  79 59 88 88 88 88 48 62 110 54 ...
##  $ start_lat         : num  40 39.9 40 40 40 ...
##  $ start_lng         : num  -83 -83 -83 -83 -83 ...
##  $ end_lat           : num  40 39.9 40 40 40 ...
##  $ end_lng           : num  -83 -83 -83 -83 -83 ...
##  $ member_casual     : Factor w/ 2 levels "casual","member": 2 2 2 1 1 1 1 2 2 1 ...
##  $ start_time        : chr  "18:08:30" "18:28:21" "19:40:43" "20:21:00" ...
##  $ end_time          : chr  "19:00:37" "18:44:35" "20:52:50" "20:44:37" ...
##  $ start_date        : Date, format: "2022-08-18" "2022-08-11" ...
##  $ end_date          : Date, format: "2022-08-18" "2022-08-11" ...

Data analysis & visualization

How many users of each membership type do we have?

# Number of rides per membership type.
count(bike, member_casual)
##   member_casual    n
## 1        casual 4069
## 2        member 3347
# Bar chart of ride counts per membership type. The fill legend is hidden
# because the x axis already names each bar. Title and axis label spelling
# corrected ("membersip" / "memebership").
ggplot(bike, aes(x = member_casual, fill = member_casual)) +
  geom_bar() +
  scale_fill_brewer(palette = "BuPu") +
  guides(fill = "none") +
  labs(title = "User membership types", x = "types of membership") +
  theme_classic()

There are more casual users (24-hour or 3-day pass) than annual members in August 2022, by around 700 (4,069 vs. 3,347). Casual users have two options: a single trip at $2.25 per 30 minutes, or $8 for unlimited 30-minute rides in a day. An annual membership, on the other hand, costs $85 a year.

What is the most frequent bike type used?

# Bar chart of ride counts per bike type. The fill legend is hidden because
# the x axis already names each bar. Title spelling corrected ("biks").
ggplot(bike, aes(x = rideable_type, fill = rideable_type)) +
  geom_bar() +
  scale_fill_brewer(palette = "BuPu") +
  guides(fill = "none") +
  labs(title = "Types of used bikes", x = "") +
  theme_classic()

There are few users of the docked bike type compared to the others. A docked bike is a bicycle that can be borrowed or rented from an automated station or “docking station”. It is interesting to ask why people would prefer the other types over this one!

Which bike type do casual users prefer vs. members?

# Count rides for every user type / bike type pair. docked_bike is left out
# first because it accounts for only 3 rides.
members_preferance <- bike %>%
  filter(rideable_type != "docked_bike") %>%
  group_by(member_casual, rideable_type) %>%
  summarise(used = n())
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.
print(members_preferance)
## # A tibble: 4 × 3
## # Groups:   member_casual [2]
##   member_casual rideable_type  used
##   <fct>         <fct>         <int>
## 1 casual        classic_bike   1689
## 2 casual        electric_bike  2377
## 3 member        classic_bike   1712
## 4 member        electric_bike  1635
# Side-by-side bars: one bar per bike type within each user type, with the
# bar height taken from the precomputed `used` counts.
ggplot(
  data = members_preferance,
  mapping = aes(x = member_casual, y = used, fill = rideable_type)
) +
  geom_col(position = "dodge") +
  scale_fill_brewer(palette = "BuPu") +
  labs(title = "Most used bike type to user", x = "type of user", y = "") +
  theme_classic()

While there is no big difference between classic and electric bikes among annual members, casual users choose electric bikes over classic ones by around 690 (2,377 vs. 1,689).

contingency table between customer type and bike type

# Contingency table of ride counts: user type (rows) by bike type (columns).
# These are raw counts, not probabilities, so rounding is not needed. For the
# share of each bike type within a user type, wrap the table in
# prop.table(..., margin = 1).
table(bike$member_casual, bike$rideable_type)
##         
##          classic_bike docked_bike electric_bike
##   casual         1689           3          2377
##   member         1712           0          1635

While annual members choose electric and classic bikes in almost equal numbers, casual users are more likely to choose an electric bike.

On which day of the week is the service used most?

# Day of the week for each ride, as a factor ordered Saturday to Friday.
# The labels come from POSIXlt's numeric weekday (0 = Sunday ... 6 = Saturday)
# rather than format(..., "%a"), whose output depends on the session locale
# and would not match the English levels below under a non-English locale.
bike$days <- factor(
  c("Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat")[
    as.POSIXlt(bike$start_date)$wday + 1L
  ],
  levels = c("Sat", "Sun", "Mon", "Tue", "Wed", "Thu", "Fri")
)


# Bar chart of ride counts per day of the week; the legend is hidden since
# the x axis already labels each bar.
ggplot(data = bike, mapping = aes(x = days, fill = days)) +
  geom_bar() +
  scale_fill_brewer(palette = "BuPu") +
  guides(fill = "none") +
  labs(
    title = "Number of users in the days of the week",
    x = "Days of the week"
  ) +
  theme_classic()

Saturdays and Wednesdays have the highest number of users, but overall there is no big difference in user counts between the days of the week.

When are the busiest and quietest times of the day?

# Hour in which each ride started. hour() is presumably lubridate::hour();
# the library() calls are not visible in this section.
# (The earlier `bike$hour <- NA` placeholder was overwritten immediately, so
# it has been dropped.)
bike$hour <- hour(bike$started_at)

# Number of rides that started in each hour.
sum_hour <- bike %>%
  group_by(hour) %>%
  summarise(sum_hour = n())

# Line chart of ride counts by start hour, with a marker and an x-axis tick
# for every hour from 0 to 23.
ggplot(data = sum_hour, mapping = aes(x = hour, y = sum_hour)) +
  geom_line(color = "#8C6BB1", size = 1) +
  geom_point(color = "#8C96C6", size = 2) +
  scale_x_continuous(breaks = seq(0, 23, 1)) +
  labs(title = "Use by hour", y = "") +
  theme_classic()

What is the hourly use of each day?

# Number of rides per (day of week, start hour) pair. This overwrites the
# hourly totals stored in sum_hour above.
sum_hour <- summarise(group_by(bike, days, hour), count = n())
## `summarise()` has grouped output by 'days'. You can override using the
## `.groups` argument.
# One panel per day of the week (stacked as rows), each showing ride counts
# by start hour. group = 1 joins all the points of a panel into one line.
ggplot(sum_hour, aes(x = hour, y = count, color = days)) +
  geom_point() +
  geom_line(aes(group = 1)) +
  facet_grid(rows = vars(days)) +
  # One colour per day, matched to the factor levels in order.
  scale_color_manual(
    values = c(
      "#BFD3E6", "#9EBCDA", "#8C96C6", "#8C6BB1",
      "#88419D", "#810F7C", "#4D004B"
    )
  ) +
  scale_x_continuous(breaks = seq(0, 23, 1)) +
  scale_y_continuous(breaks = seq(0, 130, 50)) +
  labs(title = "Use by day and hour") +
  # Remove the background, grid lines and panel border.
  theme(
    plot.background = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank()
  )

Where are the most used stations?

#library(sf)
#ggplot()+ geom_sf(bike, aes(start_lat, start_lng))
#end_station$sum_att <- end_station%>% group_by(end_lat, end_lng) %>% summarise(count = length(end_lng))
#
#
#install.packages("mapview")
library(mapview)
# Keep only rides whose end coordinates are known (end_lat / end_lng contain
# NAs); columns are referenced by name instead of by position 11 and 12.
end_station <- subset(bike, !is.na(end_lat) & !is.na(end_lng))
# TODO: change the point colour where stations are dense.
# TODO (original note, meaning unclear): "have the car for ohaio" -- possibly
# an Ohio basemap.
# Longitude is the x coordinate and latitude the y coordinate. The values are
# plain degrees, so the CRS is WGS84 (EPSG:4326) rather than the projected
# Ohio South state-plane grid in feet (EPSG:3735).
# Hovering over a start point shows the station name (`label`, previously
# misspelled as `lable`).
mapview(
  bike,
  xcol = "start_lng", ycol = "start_lat", crs = 4326, grid = FALSE,
  label = bike$start_station_name
)
mapview(
  end_station,
  xcol = "end_lng", ycol = "end_lat", crs = 4326, grid = FALSE
)
#
#
#look <- st_as_sf(bike, coords = c("start_lat", "start_lng"),  crs = 4326)
#mapview(look, map.types = "Stamen.Toner") 
#library(ggmap)
#map_sf <- get_map('Ohio', zoom = 12, maptype = 'satellite')
#ggmap(map_sf) +
#  stat_density2d(data = bike, aes(x = start_lng, y = start_lat, fill = ..density..), geom = 'tile', contour = F, alpha = .5) +  
#  scale_fill_viridis()
library(sp)
library(sf)
#> Linking to GEOS 3.6.1, GDAL 2.2.3, PROJ 4.9.3
library(mapview)

# Random demo points: 15 with longitude in [-180, -130], 5 with longitude in
# [160, 180], all with latitude in [40, 60]. Columns are named V1 (longitude)
# and V2 (latitude).
test.coords <- data.frame(
  V1 = c(runif(15, -180, -130), runif(5, 160, 180)),
  V2 = runif(20, 40, 60)
)

# Wrap the points in an sp object using a WGS84 long/lat CRS ...
test.sp <- SpatialPointsDataFrame(
  coords = cbind(test.coords$V1, test.coords$V2),
  data = test.coords,
  proj4string = CRS("+proj=longlat +datum=WGS84 +ellps=WGS84 +towgs84=0,0,0")
)

# ... and convert it to an sf object.
test_sf <- st_as_sf(test.sp)

#' Move negative first coordinates into the 0-360 range
#'
#' For every geometry in `x`, adds 360 to its first coordinate when that
#' coordinate is negative; all other values are left untouched. The CRS of
#' the geometry column is carried over unchanged.
#'
#' @param x An object with an sf geometry column (read via `st_geometry()`).
#'   Only the first coordinate value of each geometry is inspected, so this
#'   presumably targets POINT geometries.
#' @return `x` with its geometry column replaced by the shifted geometries.
shift <- function(x) {
  geom <- st_geometry(x)
  shifted <- lapply(seq_along(geom), function(i) {
    pt <- geom[[i]]
    # isTRUE() leaves a missing coordinate as it is.
    if (isTRUE(pt[1] < 0)) {
      pt[1] <- pt[1] + 360
    }
    pt
  })
  st_geometry(x) <- st_sfc(shifted, crs = st_crs(geom))
  x
}

# Draw the shifted points, then overlay the original points in orange so the
# two sets can be compared.
mapview(x = shift(test_sf)) +
  mapview(x = test_sf, col.regions = "orange")

Is there a correlation between time and day of usage?

#glm(hour ~ days, data = bike, family = )

What is the average trip distance?

Probability of user membership and distance time

#y1 = contenuse, y2 = binary
#t.test(dis_time ~ member_casual, var.equal = FALSE)